folder structure

+– 2020-google-user-study.Rproj +– plots <- folder for final and polished charts and maps +– proc_data <- folder for all data that has been processed or saved from the raw files +– raw_data <- folder for raw data files +– README.md -- rmd +– analysis.Rmd <- file for modeling data and polishing charts -- final.Rmd <- final file with polished graphs and results description

if (!requireNamespace("waffle")) install.packages("waffle")
## packages
library(tidyverse)
library(tidytext)
library(colorspace)
library(sf)
library(maps)
library(albersusa) # remotes::install_github("hrbrmstr/albersusa")
library(rgeocodio) # remotes::install_github("hrbrmstr/rgeocodio")
library(geojsonio)
library(rgeos)
library(ggwordcloud)
library(patchwork)
library(pdftools)
library(showtext)

## save plots?
save <- TRUE
#save <- FALSE

## quality of png's
dpi <- 750

## font
font_add_google("Montserrat", "Montserrat")
font_add_google("Overpass", "Overpass")
font_add_google("Overpass Mono", "Overpass Mono")

## theme updates
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))

theme_update(plot.margin = margin(30, 30, 30, 30),
             plot.background = element_rect(color = "white",
                                            fill = "white"),
             plot.title = element_text(size = 22,
                                       face = "bold",
                                       lineheight = 1.05,
                                       hjust = .5,
                                       margin = margin(10, 0, 25, 0)),
             plot.title.position = "plot",
             plot.caption = element_text(color = "grey40", 
                                         size = 9,
                                         margin = margin(20, 0, -20, 0)),
             plot.caption.position = "plot",
             axis.line.x = element_line(color = "black",
                                        size = .8),
             axis.line.y = element_line(color = "black",
                                        size = .8),
             axis.title.x = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(t = 20)),
             axis.title.y = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(r = 20)),
             axis.text = element_text(size = 11,
                                      color = "black",
                                      face = "bold"),
             axis.text.x = element_text(margin = margin(t = 10)),
             axis.text.y = element_text(margin = margin(r = 10)),
             axis.ticks = element_blank(),
             panel.grid.major.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.major.y = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.y = element_blank(),
             panel.spacing.x = unit(4, "lines"),
             panel.spacing.y = unit(2, "lines"),
             legend.position = "top",
             legend.title = element_text(family = "Montserrat",
                                         color = "black",
                                         size = 14,
                                         margin = margin(5, 0, 5, 0)),
             legend.text = element_text(family = "Montserrat",
                                        color = "black",
                                        size = 11,
                                        margin = margin(4.5, 4.5, 4.5, 4.5)),
             legend.background = element_rect(fill = NA,
                                              color = NA),
             legend.key = element_rect(color = NA, fill = NA),
             #legend.key.width = unit(5, "lines"),
             #legend.spacing.x = unit(.05, "pt"),
             #legend.spacing.y = unit(.55, "pt"),
             #legend.margin = margin(0, 0, 10, 0),
             strip.text = element_text(face = "bold",
                                       margin = margin(b = 10)))

## theme settings for flipped plots
theme_flip <-
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_line(size = .6,
                                          color = "#eaeaea"))

## theme settings for maps
theme_map <- 
  theme_void(base_family = "Montserrat") +
  theme(legend.direction = "horizontal",
        legend.box = "horizontal",
        legend.margin = margin(10, 10, 10, 10),
        legend.title = element_text(size = 17, 
                                    face = "bold"),
        legend.text = element_text(color = "grey33",
                                   size = 12),
        plot.margin = margin(15, 5, 15, 5),
        plot.title = element_text(face = "bold",
                                  size = 20,
                                  hjust = .5,
                                  margin = margin(30, 0, 10, 0)),
        plot.subtitle = element_text(face = "bold",
                                     color = "grey33",
                                     size = 17,
                                     hjust = .5,
                                     margin = margin(10, 0, -30, 0)),
        plot.caption = element_text(size = 14,
                                    color = "grey33",
                                    hjust = .97,
                                    margin = margin(-30, 0, 0, 0)))

## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)

## main color backlinko
bl_col <- "#00d188"

## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")

## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))

## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)
#saving plots function example

if (save == T){
  ggsave(here::here("plots", "1_1_jobs_word.pdf"), width = 12, height = 11, device = cairo_pdf)

0. Data

Read our fetched data from google sheets

sheets <- readRDS(here::here("proc_data/fetched_data.RDS"))

tidy the data into one clean table

Check consistency of column names

name_summary <- 
  sheets %>% 
  map_dfr(~ tibble(nm = colnames(.)), .id = "sheet_name") %>%
  spread(nm, nm) %>%
  mutate_at(-1, ~ !is.na(.))

inconsistent_cols <- Filter(Negate(all), name_summary[-1])

# show columns that are not on every question
inconsistent_cols
## # A tibble: 7 x 5
##   ...19 `Clicks on orga~ `Clicks on orga~ `Number of diff~ `Number of page~
##   <lgl> <lgl>            <lgl>            <lgl>            <lgl>           
## 1 TRUE  TRUE             FALSE            TRUE             FALSE           
## 2 TRUE  FALSE            TRUE             TRUE             FALSE           
## 3 FALSE FALSE            TRUE             FALSE            TRUE            
## 4 FALSE TRUE             FALSE            FALSE            TRUE            
## 5 FALSE FALSE            TRUE             FALSE            TRUE            
## 6 TRUE  TRUE             FALSE            FALSE            TRUE            
## 7 FALSE TRUE             FALSE            FALSE            TRUE

We need to fix typos and harmonise some names, we also remove the col “..19” as it’s just an unnamed empty column.

for (nm in names(sheets)) {
  sheets[[nm]][["...19"]] <- NULL
  names(sheets[[nm]]) <- sub("organis ", "organic ", names(sheets[[nm]]))
  names(sheets[[nm]]) <- sub("different ULRs/pages", "pages", names(sheets[[nm]]))
}

# now we try again
name_summary <- 
  sheets %>% 
  map_dfr(~ tibble(nm = colnames(.)), .id = "sheet_name") %>%
  spread(nm, nm) %>%
  mutate_at(-1, ~ !is.na(.))

inconsistent_cols <- Filter(Negate(all), name_summary[-1])
if (ncol(inconsistent_cols)) stop("we still have inconsistent column names")

Now that we have consistent column names, we must ensure we have consistent type.

type_summary <- 
  sheets %>% 
  map_dfr(summarize_all, typeof, .id = "sheet_name")

inconsistent_cols <- Filter(function(x) n_distinct(x) != 1, type_summary[-1])

if (ncol(inconsistent_cols)) stop("we have inconsistent columns types")

We can now concatenate the data of all questions in a single table

full_data <- bind_rows(sheets, .id = "question")
# remove row containing only NAs appart from é first columns
full_data <- full_data[rowSums(is.na(full_data)) != ncol(full_data) - 2,]
# convert Y/N/y/n to TRUE/FALSE
full_data <- modify_if(full_data, ~all(tolower(unique(.)) %in% c("y","n")), ~ tolower(.) == "y")

We observe that item “If clicked on Google Maps/local listing, did they click on one of the first three listings?” was assigned “N” when it is not relevant (no click on google map listing) so we fix by setting those to NA.

full_data[["If clicked on Google Maps/local listing, did they click on one of the first three listings?"]] <- 
  ifelse(full_data[["Clicks on Google Maps/local listings"]],
         full_data[["If clicked on Google Maps/local listing, did they click on one of the first three listings?"]],
         NA)

saveRDS(full_data, here::here("proc_data/clean_data.RDS"))

1. Raw analysis

We go through all items and drive out key insights.

plots <- list()

When typing the search query, did they select one of the suggested searches? (Y/N)

item <- "When typing the search query, did they select one of the suggested searches? (Y/N)"
if (names(full_data)[[3]] != item)
  stop("wrong order of columns")

plots[[item]] <- full_data %>%
  group_by(question) %>%
  summarize_at(item, ~100*mean(.)) %>%
  arrange_at(item) %>%
  mutate(fill = "a") %>%
  bind_rows(summarise_at(., 2, mean) %>% mutate(question = "Overall", fill = "b")) %>%
  mutate(question = factor(question, question)) %>%
  ggplot() +
  geom_col(aes(
    question, 
    !!sym(item),
    fill = fill)) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 30, hjust = 1)) +
  ggtitle(item) +
  labs(x = "", y = "% of users who selected a suggestion")

plots[[item]]

Seconds to first click (use check time stamping on video)

item <- "Seconds to first click (use check time stamping on video)"
if (names(full_data)[[4]] != item)
  stop("wrong order of columns")

full_data %>%
  group_by_at(4) %>%
  summarize(n = n()) %>%
  ggplot(aes(!!sym(item), n)) +
  geom_point() +
  geom_line() +
  ggtitle("raw chart")

plot_data <- full_data %>%
  group_by_at(4) %>%
  summarize(pct = n()/nrow(.)) %>%
  mutate_at("pct", cumsum) 

# quantile data with labels
q_data = tibble(
    q = quantile(full_data[[item]], c(.25,.5,.75)), 
    x = approx(plot_data$pct, plot_data[[item]], xout = c(.25,.5,.75))$y,
    y = c(.25,.50,.75),
    lab = sprintf("%s%% click before %s sec", y*100, q))

plots[[item]] <- plot_data %>%
  ggplot(aes(!!sym(item), pct))  + 
  geom_line() +
  coord_cartesian(xlim = c(0,50)) + 
  ggtitle("Time to first click") +
  labs(x = "time (seconds)", y = "% of users who click before time") +
  geom_point(aes(x,y), data =q_data, size = 4) +
  geom_text(aes(x,y, label = lab), data = q_data, hjust = 0, nudge_x = 3) +
  scale_y_continuous(labels = function(x) scales::percent(x,1))

plots[[item]]

Did they scroll to the end of the Google Search Page (Y/N)?

item <- "Did they scroll to the end of the Google Search Page (Y/N)?"
if (names(full_data)[[5]] != item)
  stop("wrong order of columns")

plots[[item]] <- full_data %>%
  group_by(question) %>%
  summarize_at(item, ~100*mean(.)) %>%
  arrange_at(item) %>%
  mutate(fill = "a") %>%
  bind_rows(summarise_at(., 2, mean) %>% mutate(question = "Overall", fill = "b")) %>%
  mutate(question = factor(question, question)) %>%
  ggplot() +
  geom_col(aes(
    question, 
    !!sym(item),
    fill = fill)) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 30, hjust = 1)) +
  ggtitle(item) +
  labs(x = "", y = "% of users who scrolled to the bottom")
plots[[item]]

Times changed/modified search query

item <- "Times changed/modified search query"
if (names(full_data)[[6]] != item)
  stop("wrong order of columns")

table(full_data[[6]])
## 
##    0    1    2    3    4    5    7   11 
## 1539  193   39   13    9    3    2    1
plots[[item]] <- full_data %>%
  transmute_at(item, ~ifelse(.>1, "2+", .)) %>%
  group_by_at(item) %>%
  count() %>%
  ungroup() %>%
  mutate(n = n/sum(n)) %>%
  na.omit() %>%
  ggplot(aes(!!sym(item),n)) +
  geom_col() +
  ggtitle(item) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) +
  labs(x = "number of changes", y = "% of users")

plots[[item]]

Times searcher clicks a result, then bounces back to the search results page and chooses a different result

item <- "Times searcher clicks a result, then bounces back to the search results page and chooses a different result"
table(full_data[[7]])
## 
##    0    1    2    3    4    5    6    8    9   11 
## 1493  209   66   20    5    2    1    2    1    1
plots[[item]] <- full_data %>%
  transmute_at(item, ~ifelse(.>1, "2+", .)) %>%
  group_by_at(item) %>%
  count() %>%
  ungroup() %>%
  mutate(n = n/sum(n)) %>%
  na.omit() %>%
  ggplot(aes(!!sym(item),n)) +
  geom_col() +
  ggtitle(item) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) +
  labs(x = "number of clicks followed by bounces", y = "% of users")

plots[[item]]

Number of pages visited

item <- "Number of pages visited"
table(full_data[[item]])
## 
##    0    1    2    3    4    5    6    7    8    9   10   15 
##   87 1054  410  145   47   29   17    3    6    1    1    1
plots[[item]] <- full_data %>%
  transmute_at(item, ~ifelse(.>=4, "4+", .)) %>%
  group_by_at(item) %>%
  count() %>%
  ungroup() %>%
  mutate(n = n/sum(n)) %>%
  na.omit() %>%
  ggplot(aes(!!sym(item),n)) +
  geom_col() +
  ggtitle(item) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) +
  labs(x = "number of pages", y = "% of users")

plots[[item]]

Clicks on organic search results

item <- "Clicks on organic search results"
table(full_data[[item]])
## 
##   0   1   2   3   4   5   6   7   8   9  13 
## 625 875 175  70  24  19   7   3   1   1   1
plots[[item]] <-
  full_data %>%
  transmute_at(item, ~ifelse(.>=3, "3+", .)) %>%
  group_by_at(item) %>%
  count() %>%
  ungroup() %>%
  mutate(n = n/sum(n)) %>%
  na.omit() %>%
  ggplot(aes(!!sym(item),n)) +
  geom_col() +
  ggtitle(item) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) +
  labs(x = "number of clicks", y = "% of users")

plots[[item]]

Clicks on Google AdWords/paid listings

item <- "Clicks on Google AdWords/paid listings"
table(full_data[[item]])
## 
##    0    1    2    3    4    5    6    7   10 
## 1456  272   46   15    2    6    2    1    1
plots[[item]] <-
  full_data %>%
  transmute_at(item, ~ifelse(.>=2, "2+", .)) %>%
  group_by_at(item) %>%
  count() %>%
  ungroup() %>%
  mutate(n = n/sum(n)) %>%
  na.omit() %>%
  ggplot(aes(!!sym(item),n)) +
  geom_col() +
  ggtitle(item) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) +
  labs(x = "number of clicks", y = "% of users")

plots[[item]] 

Clicks on video results

item <- "Clicks on video results"
table(full_data[[item]])
## 
##    0    1    2    3    4    6 
## 1603  173   15    7    2    1
plots[[item]] <-
  full_data %>%
  transmute_at(item, ~ifelse(.>=2, "2+", .)) %>%
  group_by_at(item) %>%
  count() %>%
  ungroup() %>%
  mutate(n = n/sum(n)) %>%
  na.omit() %>%
  ggplot(aes(!!sym(item),n)) +
  geom_col() +
  ggtitle(item) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) +
  labs(x = "number of clicks", y = "% of users")

plots[[item]] 

Clicks on Google Maps/local listings

item <- "Clicks on Google Maps/local listings"
table(full_data[[item]])
## 
##    0    1    2    3    4    5 
## 1684   62   36   13    5    1
plots[[item]] <-
  full_data %>%
  transmute_at(item, ~ifelse(.>=2, "2+", .)) %>%
  group_by_at(item) %>%
  count() %>%
  ungroup() %>%
  mutate(n = n/sum(n)) %>%
  na.omit() %>%
  ggplot(aes(!!sym(item),n)) +
  geom_col() +
  ggtitle(item) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) +
  labs(x = "number of clicks", y = "% of users")

plots[[item]]

If clicked on Google Maps/local listing, did they click on one of the first three listings?

item <- "If clicked on Google Maps/local listing, did they click on one of the first three listings?"
table(full_data[[item]], full_data[["question"]], useNA = "always")
##        
##         Q1_Commercial_1_pyhsical Q2_Commercial_2_service Q3_Local
##   FALSE                        0                       0        3
##   TRUE                         0                       0      105
##   <NA>                       256                     257      150
##        
##         Q4_Informational_1_video Q5_Informational_2_indeepth
##   FALSE                        1                           0
##   TRUE                         0                           0
##   <NA>                       257                         259
##        
##         Q6_transactional Q7_Commercial_3_service <NA>
##   FALSE                0                       0    0
##   TRUE                 8                       0    0
##   <NA>               249                     256    0
full_data %>%
  group_by(question) %>%
  summarize(clickers = sum(`Clicks on Google Maps/local listings` != 0), first3 = 100*mean(!!sym(item), na.rm = TRUE))
## # A tibble: 7 x 3
##   question                    clickers first3
##   <chr>                          <int>  <dbl>
## 1 Q1_Commercial_1_pyhsical           0  NaN  
## 2 Q2_Commercial_2_service            0  NaN  
## 3 Q3_Local                         108   97.2
## 4 Q4_Informational_1_video           1    0  
## 5 Q5_Informational_2_indeepth        0  NaN  
## 6 Q6_transactional                   8  100  
## 7 Q7_Commercial_3_service            0  NaN
full_data %>%
  summarize(clickers = sum(`Clicks on Google Maps/local listings` != 0), first3 = 100*mean(!!sym(item), na.rm = TRUE))
## # A tibble: 1 x 2
##   clickers first3
##      <int>  <dbl>
## 1      117   96.6

We don’t have reliable numbers here for any conclusion

Clicks on Image blocks

item <- "Clicks on Image blocks"
table(full_data[[item]], useNA = "always")
## 
##    0    1    2    4 <NA> 
## 1792    5    2    1    1
plots[[item]] <-
  full_data %>%
  transmute_at(item, ~ifelse(.>=1, "1+", .)) %>%
  group_by_at(item) %>%
  count() %>%
  ungroup() %>%
  mutate(n = n/sum(n)) %>%
  na.omit() %>%
  ggplot(aes(!!sym(item),n)) +
  geom_col() +
  ggtitle(item) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) +
  labs(x = "number of clicks", y = "% of users")

plots[[item]]

Here let’s make sure that the instruction and measurement were ok because number is really low

Clicks on Google Shopping results

item <- "Clicks on Google Shopping results"
table(full_data[[item]], useNA = "always")
## 
##    0    1    2    3    5    6    9   14 <NA> 
## 1739   31   17   10    1    1    1    1    0
plots[[item]] <-
  full_data %>%
  transmute_at(item, ~ifelse(.>=1, "1+", .)) %>%
  group_by_at(item) %>%
  count() %>%
  ungroup() %>%
  mutate(n = n/sum(n)) %>%
  na.omit() %>%
  ggplot(aes(!!sym(item),n)) +
  geom_col() +
  ggtitle(item) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) +
  labs(x = "number of clicks", y = "% of users")

plots[[item]]

Clicks “people also ask” box

item <- "Clicks \"people also ask\" box"
table(full_data[[item]], useNA = "always")
## 
##    0    1    2    3    4    7   12 <NA> 
## 1748   29   15    5    2    1    1    0
plots[[item]] <-
  full_data %>%
  transmute_at(item, ~ifelse(.>=1, "1+", .)) %>%
  group_by_at(item) %>%
  count() %>%
  ungroup() %>%
  mutate(n = n/sum(n)) %>%
  na.omit() %>%
  ggplot(aes(!!sym(item),n)) +
  geom_col() +
  ggtitle(item) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) +
  labs(x = "number of clicks", y = "% of users")

plots[[item]]

Clicks to 2nd+ Google page

item <- "Clicks to 2nd+ Google page"
table(full_data[[item]], useNA = "always")
## 
##    0    1    2    3    4 <NA> 
## 1793    3    2    1    2    0
plots[[item]] <-
  full_data %>%
  transmute_at(item, ~ifelse(.>=1, "1+", .)) %>%
  group_by_at(item) %>%
  count() %>%
  ungroup() %>%
  mutate(n = n/sum(n)) %>%
  na.omit() %>%
  ggplot(aes(!!sym(item),n)) +
  geom_col() +
  ggtitle(item) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) +
  labs(x = "number of clicks", y = "% of users")

plots[[item]] 

Seconds to complete search task

item <- "Seconds to complete search task"

full_data %>%
  group_by_at(item) %>%
  summarize(n = n()) %>%
  ggplot(aes(!!sym(item), n)) +
  geom_point() +
  geom_line() +
  ggtitle("raw chart")

# quantile data with labels

plot_data <- full_data %>%
  group_by_at(item) %>%
  summarize(pct = n()/nrow(.)) %>%
  mutate_at("pct", cumsum) 

q_data = tibble(
    q = quantile(full_data[[item]], c(.25,.5,.75)), 
    x = approx(plot_data$pct, plot_data[[item]], xout = c(.25,.5,.75))$y,
    y = c(.25,.50,.75),
    lab = sprintf("%s%% complete before %s sec", y*100, q))


plots[[item]] <-
  plot_data %>%
  ggplot(aes(!!sym(item), pct))  + 
  geom_line() +
  coord_cartesian(xlim = c(0,300)) + 
  ggtitle("Time to complete a search task") +
  labs(x = "time (seconds)", y = "% of users who complete the task before time") +
  geom_point(aes(x, y), data =q_data, size = 4) +
  geom_text(aes(x, y, label = lab), data = q_data, hjust = 0, nudge_x = 3) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) 

plots[[item]]

Total Clicks

item <- "Total Clicks"
table(full_data[[item]], useNA = "always")
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##    5   98  185  252  160  246  155  140  101   87   66   48   48   32   33 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29 
##   28   16   20    9   12    2    8    5    2    1    8    2    5    4    2 
##   30   31   32   33   34   35   36   37   38   41   60   86   94 <NA> 
##    3    2    2    1    3    2    1    2    1    1    1    1    1    0
# quantile data with labels


plot_data <- full_data %>%
  group_by_at(item) %>%
  summarize(pct = n()/nrow(.)) %>%
  mutate_at("pct", cumsum) 

q_data = tibble(
    q = quantile(full_data[[item]], c(.25,.5,.75)), 
    x = approx(plot_data$pct, plot_data[[item]], xout = c(.25,.5,.75))$y,
    y = c(.25,.50,.75),
    lab = sprintf("%s%% click less than %s times", y*100, q))

plots[[item]] <-
  plot_data %>%
  ggplot(aes(!!sym(item), pct))  + 
  geom_line() +
  #geom_point() +
  coord_cartesian(xlim = c(0,50)) + 
  ggtitle("Total number of clicks") +
  labs(x = "number of clicks", y = "% of users who clicked less than the given numeber of clicks") +
  geom_point(aes(x, y), data = q_data, size = 4) +
  geom_text(aes(x, y, label = lab), data = q_data, hjust = 0, nudge_x = 3) +
  scale_y_continuous(labels = function(x) scales::percent(x,1)) 

plots[[item]]

2. Save Results

saveRDS(plots, here::here("proc_data/plots.RDS"))